//Notes: 
//create a dataset of wtf ids then merge in the cos data with an indicator for whether it is foreign owned. 
//merge in employment data from BR for the ein to which the USID matched and for the firmid to which the USID matched from the crosswalk for foreign MNLs to firmid.  

//Inputs: 
//$cos/cos2007_wfirmid.dta //Comapny Organization Survey
//$input/affiliate_crosswalk_2007.dta //Crosswalk between USID and firmid (USID is for domestic affiliates of foreign MNLs)  This dataset has the match flags and the information on the firm and ein employment 
//$input/parent_crosswalk_2007.dta
//$output/affiliate_br_emp_shares.dta
//$output/bea_countries.dta
//$output/formnl_us_affiliates2007.dta //created by SAS_05_pull_formnl.sas
//$output/mnl_foreign_affiliates2007.dta //created by SAS_04_pull_mnl.sas

//Outputs:
//$bea/bea_inward_multiple_ids.dta
//$bea/bea_multiple_ids.dta
//$bea/conflict_foreign.dta
//$output/affiliate_br_emp_shares.dta
//$output/mne_status.dta


cd ""  /* PROJECT ROOT FOLDER */

global inputcw "input/Hannah/crosswalk"
global data  "data"
global junk  "data/junk"
global cos "input/cos" 

cap program drop combine_bea_cos
program define combine_bea_cos
  syntax , year(integer)

  //0.  Pull in the USID-Firmid crosswalk (the crosswalk between foreign MNLs and firmid) 

    if `year'==2007{
	import sas using  "$inputcw/affiliate_crosswalk_`year'.sas7bdat", clear
      	keep if missing(firmid) == 0

	
	replace ein_match_employment = 0 if ein_match_employment == . & ~missing(match_ein) //if we had a successful match and ein_match_employment is missing, that means that EIN employment in the BR is missing. 
	
	bys firmid: gen num_usids = _N
	bys firmid match_ein: gen ein_count = -_N
	bys firmid match_ein: replace ein_match_employment = . if _n>1 //when we collapse to firm level, don't want to double count ein emplolyment
	sort firmid ein_count  
      collapse (sum) ein_match_employment name_match=name ein_match=ein  ///
	(first) match_ein num_usids firm_match_employment firm_match_establishments, by(firmid)   //for the rest of the duplicates, sum up the employment at the foreign owned EINs 
      
      //make the employment share variablescd 
	gen foreign_emp_share = ein_match_employment / firm_match_employment 
	// HR: Don't think of num_usids as establishments. In a perfect world if USIDs represented a firm we would have a 1-1 mapping between USID and firmids in practice we have several USIDs that map to the same firmid. 
	* gen foreign_estab_share = num_usids / firm_match_establishments
	sum foreign_emp_share, d

      keep firmid foreign_emp_share name_match ein_match 
	save $data/affiliate_br_emp_shares`year'.dta, replace
    }
  **************************************


  //1. Bring in the Outward FDI data
  *************************************
  import sas using "$data/mnl_foreign_affiliates`year'.sas7bdat", clear
	keep if missing(firmid) == 0
  keep firmid bea_name aff_ein* aff_emp aff_sales aff_match_flag aff_ctry aff_naics foreign_id us_id
  rename aff_ctry countrycode
  merge m:1 countrycode using "$data/bea_countries.dta", keepusing(countrycode country) keep(master match) nogen //identify countries
  rename countrycode aff_ctry
  *************************************

  //2.  Collapse over foreign id (industry-country identifiers within a bea firmid)
  *************************************
    tab foreign_id
  collapse (sum) aff_emp aff_sales, by(firmid us_id bea_name aff_ein1 aff_ein2 aff_match_flag aff_ctry country)
  *************************************

  //3.  Count number of us_ids in outward data to a firmid
  *************************************
  /*
  preserve 
    collapse (sum) aff_sales, by(firmid us_id bea_name)
    gen count=1
    bys firmid: egen bea_count=sum(count)
    summ bea_count
    gen mm=1 if bea_count>1
    tab mm, miss
    *Output dataset for bea
    keep if bea_count>1
    save $bea/bea_multiple_ids.dta, replace
  restore  
  */
  *************************************   
    

  //4. Aggregate to the firm level
  *************************************
  //a. keep the most important country and firm name based on sales
    bys firmid: egen tot_aff_sales=sum(aff_sales)
    gen aff_sales_share=aff_sales/tot_aff_sales
    
    sort firmid aff_sales_share
    bys firmid: gen byte keep = (_n==_N)
    bys firmid: egen lowst_aff_match_flag=min(aff_match_flag)
    bys firmid us_id: gen count=1 if _n==1
    bys firmid: egen bea_count_outward=sum(count)
    drop count
    label variable bea_count_outward "No. of BEA us_ids in the OUT data for this firmid"
    
    keep if keep==1
    keep us_id firmid bea_name aff_ein? aff_match_flag aff_ctry country aff_emp aff_sales tot_aff_sales aff_sales_share lowst_aff_match_flag bea_count_outward
    describe 
    
    save $junk/temp1.dta, replace
  *************************************



  //5. Bring in the Inward FDI data
  *************************************
  use  if missing(firmid)==0 using "$data/formnl_us_affiliates`year'.dta", clear 
  ren Usid usid
  keep  firmid  in_ein* usid bea_name in_match_flag in_ctryparent in_state in_typeownership in_emp in_sales
  rename in_ctryparent countrycode
  rename bea_name bea_in_name
  if `year' ==2012 {
    replace countrycode="" if countrycode=="Majority"
    destring countrycode, replace
  }
  merge m:1 countrycode using "$data/bea_countries.dta", keepusing(countrycode country) keep(master match) nogen //identify countries
  rename countrycode in_ctryparent
  gen formnl_country=country
  *************************************
  
  
  //6.  Count number of us_ids in the inward data to a firmid
  *************************************
  preserve 
    collapse (sum) in_sales, by(firmid usid bea_in_name)
    gen count=1
    bys firmid: egen bea_count=sum(count)
    summ bea_count
    gen mm=1 if bea_count>1
    tab mm, miss
    *Output dataset for bea
    keep if bea_count>1
    save $data/bea_inward_multiple_ids.dta, replace
  restore  
  
  *************************************   
    
  //7.  Aggregate to the firm level 
  ************************************* 
    //a. keep the most important country and firm name based on sales
    bys firmid: egen tot_in_sales=sum(in_sales)
    gen in_sales_share=in_sales/tot_in_sales
    
    sort firmid in_sales_share
    bys firmid: gen keep=1 if _n==_N
    bys firmid: egen lowst_in_match_flag=min(in_match_flag)
    bys firmid usid: gen count=1 if _n==1
    bys firmid: egen bea_count_inward=sum(count)
    drop count
    label variable bea_count_inward "No. of BEA us_ids in the OUT data for this firmid"
    label variable usid "Inward US ID"
    keep if keep==1
    keep usid firmid bea_in_name in_ctryparent in_typeownership in_ein1 in_ein2  ///
	      in_emp in_sales country formnl_country lowst_in_match_flag bea_count_inward
  *************************************  
  
  //8.  Combine the Outward and Inward data
  *************************************
  merge 1:1 firmid using "$junk/temp1.dta"
  gen str status_bea="OUTWARD" if _merge==2
  replace status_bea="INWARD" if _merge==1
  replace status_bea="BOTH" if _merge==3
  drop _merge
  label variable status_bea "Status based on presence in BEA datasets" 
  save "$junk/temp2.dta", replace
  *************************************


  //9. Pull in the COS data
  *************************************

  //Load COS data
  //raw data is at the survunitid level, collapse to firm calculating share of estabs that are majority foreign owned 
  use if missing(firmid)==0 using "$cos/cos`year'_wfirmid.dta", clear
  duplicates report firmid 

  //JD quality checks block
  count //Does this number make sense?
  assert affil_forstock_ge10=="Y" if affil_forstock_ge10_yes=="X"
  tab affil_forstock_ge10 affil_forstock_ge10_no, missing  //Data quality check;
  tab affil_forstock_ge10 if affil_for_stock_eq50=="X" | affil_for_stock_51to99=="X" | affil_for_stock_eq100=="X", missing
  assert affil_for_ge10=="Y" if affil_for_ge10_yes=="X"
  tab affil_for_ge10 affil_for_ge10_no, missing  //Data quality check; 
  tab affil_for_ge10 if affil_for_stock_eq50=="X" | affil_for_stock_51to99=="X" | affil_for_stock_eq100=="X", missing  //Data quality check: 
  tab affil_for_ge10_yes if affil_for_stock_eq50=="X" | affil_for_stock_51to99=="X" | affil_for_stock_eq100=="X", missing
  tab affil_for_ge10 if affil_for_stock_10to24=="X" | affil_for_stock_25to49=="X", missing //Data quality check; f
  ***********************

  //10. Make COS variables to use for identification
  ***********************
  gen byte cos_foreign_owned = (affil_for_stock_eq50=="X" | affil_for_stock_51to99=="X" | affil_for_stock_eq100=="X")
  label variable cos_foreign "COS indicates majority foreign-owned"
  gen byte cos_fo_conflict = (cos_foreign_owned==1 & affil_for_ge10_no=="X")
    
  gen byte cos_owns_affils = (affil_forstock_ge10_yes=="X")
  label variable cos_owns_affils "COS says it owns foreign affiliates"
  ***********************


  //11. Collapse to firm level (no emp or other information here...) 
  ***********************
  gen enterprises=1 
    bysort firmid: egen ent_tot=sum(enterprises)
    tab ent_tot
    * browse if ent_tot>1
  keep cos_foreign_owned cos_fo_conflict cos_owns_affils cos_owns_affils  enterprise  firmid
  collapse (sum) cos* enterprise, by(firmid)
  foreach var in cos_foreign_owned cos_owns_affils {
    gen byte cc_`var' = (`var'~=enterprises & enterprises>1 & `var'>0)
    }
    * browse if cc_cos_for==1 | cc_cos_owns==1

  //Make COS indicators
  gen cos_status="BOTH" if inrange(cos_foreign_owned,1,.) & inrange(cos_owns_affils,1,.)
  replace cos_status="OUTWARD" if cos_foreign_owned==0 & inrange(cos_owns_affils,1,.)
  replace cos_status="INWARD" if inrange(cos_foreign_owned,1,.) & cos_owns_affils==0
  replace cos_status="DOM" if cos_foreign_owned==0 & cos_owns_affils==0 
  label variable cos_status "MNE status based on aggregation of COS enterprise statuses"
  save "$junk/temp_cos.dta", replace

  *************************

  //12. Combine COS, BEA data, BR emp data
  *************************
  use "$junk/temp_cos.dta", clear
  merge 1:1 firmid using $junk/temp2.dta
  tab _merge status_bea
  gen byte in_cos = (_merge==1 | _merge==3)
  gen byte in_bea = (_merge==2 | _merge==3)
  label variable in_cos "Firm appears in COS data"
  label variable in_bea "Firm appears in BEA data"
  replace cos_status="MISS" if _merge==2
  drop _merge

  tab cos_foreign_owned status_bea, miss
  tab cos_owns_affils status_bea, miss
  tab cos_status status_bea, miss
  tab cos_status status_bea if cos_status!="MISS", miss row nof  
  tab status_bea cos_status, miss row nof   
  *************************

  //13.  Bring in the BR employment data
  *************************
  if `year'==2007{
    merge 1:1 firmid using $data/affiliate_br_emp_shares`year'.dta
      
      *Examine the foreign-owned emp shares
      bys status_bea: summ foreign_emp_share
      bys cos_status: summ foreign_emp_share  
  }
  else {
    gen foreign_emp_share = .
  }
  **************************

  //14. Assign firm types
  *************************
    *RULES:
    *a) Assign as Foreign-owned if BEA data say foreign-owned only and no COS information or COS agrees
    capture drop mne_status
    gen mne_status="FOR" if status_bea=="INWARD" & (cos_status=="INWARD" | cos_status=="MISS" | cos_status=="BOTH")  
      tab mne_status
      
    *b)  Assign as Foreign-owned if BEA & COS says both or in
    replace mne_status="FOR" if status_bea=="BOTH" & (cos_status=="BOTH" | cos_status=="INWARD")
    
    *c) Assign as Foreign-owned if BEA data say foreign-owned and emp share is majority and no COS information or COS agrees 
    replace mne_status="FOR" if (status=="BOTH" & foreign_emp_share>0.49 & foreign_emp_share~=.) & (cos_status=="INWARD" | cos_status=="MISS" | cos_status=="BOTH")
   
    *d) How many firms that BEA says are IN and that COS says are DOM or OUT?  (528)
      gen count=1
      tab count if status_bea=="INWARD" & (cos_status=="DOM" | cos_status=="OUTWARD")
      tab count if status_bea=="INWARD" & (cos_status=="DOM" | cos_status=="OUTWARD") & (foreign_emp_share>0.49 & foreign_emp_share~=.)
      gen to_ignore=1 if status_bea=="INWARD" & (cos_status=="DOM" | cos_status=="OUTWARD") & (foreign_emp_share<=0.49 & foreign_emp_share~=.)
      
      *browse firmid cos_foreign_owned cos_fo_conflict bea_in_name bea_count_inward status_bea cos_status foreign_emp_share  ///
	*  if status_bea=="INWARD" & (cos_status=="DOM" | cos_status=="OUTWARD") & to_ignore==1
	
	preserve
	  keep  firmid cos_foreign_owned cos_fo_conflict bea_in_name bea_count_inward status_bea cos_status foreign_emp_share  to_ig
	  keep if status_bea=="INWARD" & (cos_status=="DOM" | cos_status=="OUTWARD") & to_ignore==1
	  save  $data/conflict_foreign_`year'.dta, replace
	restore
	
    *e) Assign as foreign-owned if BEA says IN and emp_share>.49
      replace mne_status="FOR" if status_bea=="INWARD" & (cos_status=="DOM" | cos_status=="OUTWARD") & (foreign_emp_share>0.49 & foreign_emp_share~=.)
      
    *f) Assign as US MNE if BEA says out & COS agrees or no info
      replace mne_status="MNE" if status_bea=="OUTWARD" &  (cos_status=="OUTWARD" | cos_status=="MISS" | cos_status=="BOTH")
	  tab status_bea mne_status, miss  

      
    *g) Assign all remaining US MNEs regardless of COS information
	replace mne_status="MNE" if status_bea=="OUTWARD"
	replace mne_status="MNE" if status_bea=="BOTH" & mne_status==""  
	tab mne_status, miss
	tab mne_status if in_bea==1, miss
	replace mne_status="FO2" if mne_status=="" & status_bea=="INWARD"  
	
    *h) Make a second mne_status variable for the missing bea ids flagged in the COS
      gen mne_status_supp=mne_status
	replace mne_status_supp="MNE" if in_bea==0 & cos_status=="OUTWARD"
	replace mne_status_supp="FOR" if in_bea==0 & cos_status=="INWARD"  
	tab mne_status_supp mne_status, miss
  ******************
  
    tab status_bea mne_status, miss
    tab cos_status mne_status if status_bea=="BOTH", miss
      //  All BEA inward is INWARD
      //  All BEA outward is OUTWARD
      //  BEA BOTH: split according to COS and emp shares
		//  if COS says both: then FOR
		//  if COS says DOM: then MNE
		//  if COS says IN: then FOR
		//  if COS says OUT: then MNE
		//  if COS says MISS: split (by foreign emp)

  //15.  Save an MNE type dataset
  ******************
  keep firmid in_bea in_cos cos_status mne_stat* status_bea formnl_country foreign_emp_share
  label variable status_bea "Status based on presence in BEA datasets"
  label variable mne_status "MNE status based on COS and BEA data"
  label variable mne_status_supp "MNE status with info for plants missing from BEA data"
  label variable formnl_country "Foreign firm parent country based on majority emp"




  gen int year = `year' 
		     
  label data "Firm-level MNE status in 2007 from COS and BEA, with BR emp info" 
  compress
  save $data/mne_status`year'.dta, replace

end //end program combine_bea_cos


  combine_bea_cos, year(2007)
 * combine_bea_cos, year(2012)


